""" Merge all the well data together

"""
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from datetime import datetime, timedelta

#%% SETUP CONFIGURATION -----------------------------------------------------------------
data_folder = 'data_py'

files = [
    'boreholes',
    'permits_non_ewell_main',
    'permits_ewell_main',
    'permits_ewell_casing_intervals',
    'permits_ewell_casing_sections',
    'bottomhole_pressure',
    'war_main',
    'war_main_prop',
    'war_tubular_summaries',
    'war_tubular_summaries_prop',
    'lease'
]
for i in range(2000, 2016):
    files.append(f'ogora{i}delimit')

keep_columns_ewell = {
    'sn_apd': 'permit_id',
    'csng_intv_num': 'interval_num',
    'casing_section_num': 'section_num',
    'csng_mud_wgt_ppg': 'mud_weight',
    'kickoff_point_md': 'kickoff_point_md',
    'api_well_number': 'api',
    'mms_company_num': 'mms',
    'permit_type': 'permit_type',
    'rig_name': 'rig_name',
    'rig_type_code': 'rig_type',
    'rig_id_num': 'rig_id_num',
    'rated_wtr_depth': 'rated_water_depth',
    'bus_asc_name': 'bus_asc_name',
    'apd_sub_status_dt': 'sub_date'
}

keep_columns_non_ewells = {
    'api_well_number': 'api',
    'mms_company_num': 'mms',
    'rig_name': 'rig_name',
    'rig_id_num': 'rig_id_num',
    'bus_asc_name': 'bus_asc_name',
    'apd_received_dt': 'sub_date'
}

keep_columns_boreholes = {
    'api_well_number': 'api',
    'botm_lease_number': 'botm_lease_number',
    'surf_lease_number': 'surf_lease_number',
    'company_name': 'company_name',
    'well_spud_date': 'spud_date',
    'total_depth_date': 'depth_date',
    'well_bore_tvd': 'vertical_depth',
    'bh_total_md': 'total_depth',
    'well_td_ss': 'subsea_depth',
    'well_bp_st_kickoff_md': 'kickoff',
    'water_depth': 'water_depth',
    'surf_latitude': 'surf_latitude',
    'surf_longitude': 'surf_longitude',
    'botm_latitude': 'botm_latitude',
    'botm_longitude': 'botm_longitude',
    'botm_area_code': 'field'
}

keep_columns_war = {
    'sn_war': 'sn_war',
    'api_well_number': 'api',
    'drilling_tvd': 'drilling_tvd',
    'drill_fluid_wgt': 'drill_fluid_wgt',
    'war_start_dt': 'war_start_dt',
    'war_end_dt': 'war_end_dt'
}

keep_columns_war_casings = {
    'sn_war': 'sn_war',
    'sn_war_csng_intv': 'sn_war_csng_intv',
    'api_well_number': 'api',
    'csng_setting_top_md': 'csng_setting_top_md',
    'csng_setting_botm_md': 'csng_setting_botm_md',
}

keep_columns_permit = [
    'api',
    'mms',
    'company_name',
    'bus_asc_name',
    'permit_id',
    'permit_type',
    'rig_name',
    'rig_type',
    'rig_id_num',
    'n_strings',
    'max_mud_weight',
    'spud_date',
    'depth_date',
    'kickoff',
    'total_depth',
    'vertical_depth',
    'water_depth',
    'surf_lease_number',
    'botm_lease_number',
    'surf_latitude',
    'surf_longitude',
    'botm_latitude',
    'botm_longitude',
    'bid',
    'effective_date',
    'oil_volume',
    'gas_volume',
    # 'deepening',
    'deepened_candidate',
    'field'
]

#%% READ IN DATA ------------------------------------------------------------------------
df = dict()
for file in files:
    df[file] = pd.read_csv(
        f"./{data_folder}/temp/01_unzip/raw_csv/{file}.csv",
        index_col=[0]
    )
    df[file].columns = df[file].columns.str.lower()


#%% MERGE EWELL PERMIT TOGETHER ---------------------------------------------------------
df['permits_ewell_clean'] = (
    df['permits_ewell_main']
    .merge(
        df['permits_ewell_casing_intervals'],
        left_on='sn_apd',
        right_on='sn_apd_fk',
        validate='1:m',
        how='inner'  # Some permits are not merged (if use 'left')...
    )
    .merge(
        df['permits_ewell_casing_sections'],
        left_on='sn_apd_csg_intv',
        right_on='sn_apd_csng_intv_fk',
        validate='1:m',
        how='left',
        indicator=True,
    )
    .loc[:, keep_columns_ewell.keys()]
    .rename(columns=keep_columns_ewell)
    .assign(final_sub_date=
        (lambda x: x.groupby(['api'], as_index=False)['sub_date'].transform(max))
    )
    .query('final_sub_date == sub_date')
    .assign(
        n_strings_permit=(lambda x:
            x.groupby(['api'], as_index=False)['interval_num'].transform(len)),
        max_mud_weight_permit=(lambda x:
            x.groupby(['api'], as_index=False)['mud_weight'].transform(max))
    )
    .drop(['interval_num', 'section_num', 'mud_weight'], axis=1)
    .drop_duplicates('api')
)

#%% CLEANUP NAMES OF EWELL PERMITS ------------------------------------------------------
df['permits_non_ewell_clean'] = (
    df['permits_non_ewell_main']
    .loc[:, keep_columns_non_ewells.keys()]
    .rename(columns=keep_columns_non_ewells)
    .replace({
        'CANCELED': np.nan,
        '42709409410K': np.nan,
        '42706404130K': np.nan
    })
    .dropna(subset=['api'])
    .assign(api=lambda x: x['api'].astype(int))
)
df['permits_non_ewell_clean'] = (
    df['permits_non_ewell_clean']
    .loc[
        ~df['permits_non_ewell_clean']['api']
        .isin(
            list(df['permits_ewell_clean']['api'].values)
        )
    ]
)

#%% CLEANUP BOTTOMHOLE PRESSURE DATA ----------------------------------------------------
df['bottomhole_pressure_clean'] = (
    df['bottomhole_pressure']
    .assign(
        pressure_max=lambda x: (
            x.groupby(['api_well_number'], as_index=False)['bhtst_pressure']
            .transform(max)
        )
    )
    .drop_duplicates('api_well_number')
    .loc[:, ['api_well_number', 'pressure_max']]
    .rename(columns={'api_well_number': 'api'})
)

#%% CLEANUP BOREHOLE DATA ---------------------------------------------------------------
df['boreholes_clean'] = (
    df['boreholes']
    .loc[:, keep_columns_boreholes.keys()]
    .rename(columns=keep_columns_boreholes)
    .assign(spud_date=lambda x: pd.to_datetime(x['spud_date']))
    .assign(depth_date=lambda x: pd.to_datetime(x['depth_date']))
    .merge(
        df['bottomhole_pressure_clean'],
        on='api',
        how='left'
    )
)
df['boreholes_clean']['botm_lease_number'] = (
    df['boreholes_clean']['botm_lease_number']
    .str.lstrip()
)

#%% Include original date that the borehole was drilled
df['boreholes_clean']['api'] = df['boreholes_clean']['api'].astype('str')
df['boreholes_clean']['api_group'] = df['boreholes_clean']['api'].str[:-1]
df['boreholes_clean']['depth_date_original'] = df['boreholes_clean'].loc[
    (df['boreholes_clean']['api'].str[-1] == '0'),
    'depth_date'
]
df['boreholes_clean'] = df['boreholes_clean'].sort_values('api')

df['boreholes_clean']['depth_date_original'] = (
    df['boreholes_clean']
    .groupby('api_group')
    ['depth_date_original']
    .transform(
        lambda x: x.ffill()
    )
)
df['boreholes_clean']['delta_depth_date'] = (
        df['boreholes_clean']['spud_date']
        - df['boreholes_clean']['depth_date_original']
).dt.days
df['boreholes_clean']['deepened_candidate'] = (
        df['boreholes_clean']['delta_depth_date'] > 365)
df['boreholes_clean']['api'] = df['boreholes_clean']['api'].astype('int')

#%% CLEANUP PRODUCTION DATA -------------------------------------------------------------
df_production_by_year = dict()
df_api_interval_by_year = dict()
for i in range(2000, 2016):
    df_api_interval_by_year[i] = df[f'ogora{i}delimit'].drop_duplicates(
        subset=['api', 'production_interval']
    )

    df[f'ogora{i}delimit']['first_production_date'] = pd.to_datetime(
        df[f'ogora{i}delimit']['first_production_date'], format='%Y%m%d'
    )
    df[f'ogora{i}delimit']['production_date'] = pd.to_datetime(
        df[f'ogora{i}delimit']['production_date'], format='%Y%m'
    )
    df[f'ogora{i}delimit'] = df[f'ogora{i}delimit'][
        df[f'ogora{i}delimit']['first_production_date'].dt.year >= 2000
    ]
    df[f'ogora{i}delimit']['first_production_plus_5_years'] = \
        df[f'ogora{i}delimit']['first_production_date'] + pd.Timedelta(days=365*5)
    df[f'ogora{i}delimit'] = \
        df[f'ogora{i}delimit'][
            df[f'ogora{i}delimit']['production_date']
            <= df[f'ogora{i}delimit']['first_production_plus_5_years']
        ]

    df_production_by_year[i] = (
        df[f'ogora{i}delimit']
        .groupby(['api'], as_index=False)
        [['days_on_production', 'monthly_oil_volume', 'monthly_gas_volume']]
        .sum()
    )

df_production = (
    pd.concat(df_production_by_year)
    .groupby('api', as_index=False)
    .sum()
    .rename(
        columns={
            'monthly_oil_volume': 'oil_volume',
            'monthly_gas_volume': 'gas_volume'
        }
    )
)

df_api_interval = (
    pd.concat(df_api_interval_by_year)
    .drop_duplicates(
        subset=['api', 'production_interval']
    )[['api', 'production_interval']]
)

# Cut non-producing interval
df_api_interval = df_api_interval[
    (df_api_interval['production_interval'].str[0] != 'X')
]

#%% GET COMMON API ITERVALS ---------------------------------------------------
df_api_interval['api'] = df_api_interval['api'].astype('str')
df_api_interval['api_group'] = df_api_interval['api'].str[:-1]
df_api_interval['production_interval_original'] = df_api_interval.loc[
    (df_api_interval['api'].str[-1] == '0'),
    'production_interval'
]
df_api_interval = df_api_interval.sort_values('api')

#%% Get the production interval of the initial well
df_api_interval['production_interval_original'] = (
    df_api_interval
    .groupby('api_group')
    ['production_interval_original']
    .transform(
        lambda x: x.ffill()
    )
)
df_api_interval['deepening'] = (
    (
        (df_api_interval['api'].str[-1] != '0')
        & (
            df_api_interval['production_interval'] \
            == df_api_interval['production_interval_original']
        )
    )
)

# Produce a list of the wells that can be classified as 'deepened'
df_deepened = df_api_interval[df_api_interval['deepening'] == True]
df_deepened = df_deepened.reset_index()[['api', 'deepening']]
df_deepened['api'] = df_deepened['api'].astype('int')

#%% MERGE BOREHOLE DATA WITH LEASE DATA -------------------------------------------------
df['boreholes_clean'] = (
    df['boreholes_clean']
    .merge(
        df['lease'],
        on='botm_lease_number',
        how='left'
    )
)

#%% CLEANUP WELL ACTIVITY REPORT DATA ---------------------------------------------------
df['war_clean'] = (
    df['war_main']
    .merge(
        df['war_main_prop'],
        on='sn_war'
    )
    .loc[:, keep_columns_war.keys()]
    .rename(columns=keep_columns_war)
)

#%% GET MUD WEIGHT DATA FROM WAR --------------------------------------------------------
df['war_mud_weight'] = (
    df['war_clean']
    .groupby('api', as_index=False)['drill_fluid_wgt']
    .max()
    .rename(columns={'drill_fluid_wgt': 'max_mud_weight_war'})
)

#%% CLEANUP WAR WITH CASINGS DATA -------------------------------------------------------
df['war_with_casings_clean'] = (
    df['war_main']
    .merge(
        df['war_tubular_summaries'],
        left_on='sn_war',
        right_on='sn_war_fk',
        how='right'
    )
    .merge(
        df['war_tubular_summaries_prop'],
        left_on='sn_war_csng_intv',
        right_on='sn_war_csng_intv_fk',
        how='right'
    )
    .loc[:, keep_columns_war_casings.keys()]
    .rename(columns=keep_columns_war_casings)
    .assign(api=lambda x: x['api'].astype(int))
)

#%% GET NUMBER OF CASING STRINGS FROM CASINGS DATA --------------------------------------
df['war_n_strings'] = (
    df['war_with_casings_clean']
    .drop_duplicates(
        ['api', 'csng_setting_top_md', 'csng_setting_botm_md']
    )
    .groupby(['api'], as_index=False)['sn_war'].agg(len)
    .rename(columns={'sn_war': 'n_strings_war'})
    .assign(api=lambda x: x['api'].astype(int))
    .loc[:, ['api', 'n_strings_war']]
)

#%% COMBINE EWELL AND NON-EWELL PERMITS -------------------------------------------------
df['permits_all'] = (
    df['permits_ewell_clean']
    .append(df['permits_non_ewell_clean'])
    .merge(
        df['boreholes_clean'],
        on='api',
        validate='m:1',
        how='inner'
    )
    .merge(
        df['war_mud_weight'],
        on='api',
        how='left'
    )
    .merge(
        df['war_n_strings'],
        on='api',
        how='left'
    )
)

#%% COMBINE PERMITS WITH PRODUCTION DATA ------------------------------------------------
df['permits_all'] = df['permits_all'].merge(
        df_production,
        on='api',
        how='left'
    ).merge(
        df_deepened,
        on='api',
        how='left'
    )


#%% CUT DOWN TO A SINGLE WELL DATABASE --------------------------------------------------
df['permits_cut'] = (
    df['permits_all']
    .loc[
        (
            (df['permits_all']['spud_date'].dt.year >= 2000)
            & (df['permits_all']['spud_date'].dt.year <= 2015)
            & (df['permits_all']['water_depth'] <= 500)
        )
    ]
    .drop_duplicates(subset=['api', 'rig_name'])
)
df['permits_cut'].to_csv(f"./{data_folder}/temp/02_combine_wells/wells_combined.csv")

#%% COMBINE STRINGS AND MUD WEIGHT ------------------------------------------------------
df['permits_clean'] = (
    df['permits_cut']
    .assign(
        max_mud_weight=df['permits_cut']['max_mud_weight_permit'].fillna(
            df['permits_cut']['max_mud_weight_war']
        ),
        n_strings=df['permits_cut']['n_strings_permit'].fillna(
            df['permits_cut']['n_strings_war']
        )
    )
    .loc[:, keep_columns_permit]
)

#%% IMPUTE THE NUMBER OF STRINGS --------------------------------------------------------
bid_cut = 1980  # Cut off year for the bid data to impute (older = leases more likely
    # to have lots of oil extracted already)

df['permits_clean']['impute_n_strings'] = df['permits_clean']['n_strings'].isnull()
formula = (
    'n_strings '
    '~ max_mud_weight + np.power(max_mud_weight, 2)'
)
mod = (
    smf.ols(
        formula=formula,
        data=df['permits_clean'].query('(impute_n_strings == False)')
    ).fit()
)
df['permits_clean']['n_strings_regression'] = (
    mod.params['Intercept']
    + mod.params['max_mud_weight'] * df['permits_clean']['max_mud_weight']
    + mod.params['np.power(max_mud_weight, 2)']
    * df['permits_clean']['max_mud_weight'] * df['permits_clean']['max_mud_weight']
)
df['permits_clean']['n_strings_fill'] = (
    df['permits_clean'].groupby(['field'])['n_strings'].transform('mean')
)
df['permits_clean']['max_mud_weight_fill'] = (
    df['permits_clean'].groupby(['field'])['max_mud_weight'].transform('mean')
)

df['permits_clean']['effective_date'] = pd.to_datetime(df['permits_clean']['effective_date'])
df['permits_clean']['bid_fill'] = (
    df['permits_clean'][
        (df['permits_clean']['effective_date'].dt.year >= bid_cut)
        & (df['permits_clean']['effective_date'].dt.year <= 2009)
        ].groupby(['field'])['bid'].transform('mean')
)

df['permits_clean']['n_strings'] = df['permits_clean']['n_strings'].fillna(
    df['permits_clean'][df['permits_clean']['spud_date'].dt.year <= 2009]['n_strings_fill'].mean()
)
df['permits_clean']['max_mud_weight'] = df['permits_clean']['max_mud_weight'].fillna(
    df['permits_clean'][df['permits_clean']['spud_date'].dt.year <= 2009]['max_mud_weight_fill'].mean()
)
df['permits_clean'].loc[df['permits_clean']['effective_date'].dt.year < bid_cut, 'bid'] = \
    df['permits_clean'].loc[df['permits_clean']['effective_date'].dt.year >= bid_cut, 'bid_fill']
df['permits_clean'] = df['permits_clean'].dropna(
    subset=[
        'surf_latitude',
        'surf_longitude',
        'botm_latitude',
        'botm_longitude'
    ]
)

df['permits_clean'].to_csv(f"./{data_folder}/temp/02_combine_wells/permits_clean.csv")
